We’re going to look at Instacart data.
# Load datasets
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(p8105.datasets)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
# Load the instacart dataset and reduce it to what the plots below use:
# standardized column names, five departments, rows whose order_hour_of_day
# lies between 8 and 22 (inclusive), and six columns.
data(instacart)

instacart <- janitor::clean_names(instacart) %>%
  filter(
    department %in% c("produce", "frozen", "dairy eggs", "pantry", "household"),
    # Comma-separated predicates are combined with a logical AND.
    order_hour_of_day >= 8,
    order_hour_of_day <= 22
  ) %>%
  select(
    department,
    reordered,
    order_hour_of_day,
    days_since_prior_order,
    order_dow,
    add_to_cart_order
  )
This scatter plot examines whether there is a relationship between the time since a customer’s last order and the likelihood of reordering.
# Scatter plot: days_since_prior_order on the x-axis against the reordered
# flag on the y-axis, drawn as small, semi-transparent markers.
plot_ly(
  data = instacart,
  x = ~days_since_prior_order,
  y = ~reordered,
  type = "scatter",
  mode = "markers",
  marker = list(opacity = 0.6, size = 5)
) %>%
  layout(
    title = "Scatter Plot of Days Since Prior Order vs Reordered",
    xaxis = list(title = "Days Since Prior Order"),
    yaxis = list(title = "Reordered (1 = Yes, 0 = No)")
  )
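This box plot investigates how the add-to-cart order varies across different days of the week. Specifically, it can help answer questions like: do items tend to be added later in the cart (that is, to larger carts) on some days than on others?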
# Box plot: one box of add_to_cart_order per order_dow value. order_dow is
# converted to a factor for the color mapping so each day gets its own
# discrete color.
plot_ly(
  data = instacart,
  x = ~order_dow,
  y = ~add_to_cart_order,
  color = ~factor(order_dow),
  type = "box",
  colors = "viridis"
) %>%
  layout(
    title = "Distribution of Add-to-Cart Order by Day of the Week",
    xaxis = list(title = "Day of the Week"),
    yaxis = list(title = "Add to Cart Order")
  )
This bar chart investigates which departments tend to have shorter or longer gaps between orders. This could reveal whether certain types of products are purchased more frequently.
# Per-department mean of days_since_prior_order; missing values are dropped
# before averaging (na.rm = TRUE).
barplot <- group_by(instacart, department) %>%
  summarise(
    avg_days_since_prior_order = mean(days_since_prior_order, na.rm = TRUE)
  )

# Bar chart of the summary table: one bar per department, colored by
# department.
plot_ly(
  data = barplot,
  x = ~department,
  y = ~avg_days_since_prior_order,
  type = "bar",
  color = ~department
) %>%
  layout(
    title = "Average Days Since Prior Order by Department",
    xaxis = list(title = "Department"),
    yaxis = list(title = "Average Days Since Prior Order")
  )